Purpose
Introduction
Setup
Begin by downloading and opening the following packages into your library. The gelnet library will be used to train the model, dplyr and gdata are used for data manipulation.
library(gelnet)
library(dplyr)
library(gdata)
Data
The the pcbc data is organized in a data frame with 99 samples as columns and 219 methylation probes as rows.
load("pcbc.data.Rda")
pcbc.data
load("pcbc.pd.f.Rda")
head(pcbc.pd.f)
Then find the mean center by subtracting the mean of each probe from the entire pcbc data. The mean of each probe just be in a numeric vector the same size as the number of probes, in this case 219.
m <- apply(pcbc.data, 1, mean)
m[1:5]
cg02927655 cg15948871 cg17676824 cg25552705 cg21434327
0.4130770 0.2652034 0.3821945 0.4168715 0.3776927
pcbc.data.2 <- pcbc.data - m
pcbc.data.2
Identify stem cells and break up all samples into 2 groups:
- Stem cell (‘X.tr’ object)
- not stem cell (‘X.bk’ object)
# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC
# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples
Now we can begin to train the the one-class model with the gelnet function. The gelnet function can be used for Linear Regression, Binary Classification and One class Problems by using an iterative method called coordinated descent (Sokolov et al. 2016).
It has four main arguments described below:
- X: n by p matrix => transpose(‘X.r’)
- y: ‘NULL’ for one class models
- l1: coefficient for the L1-norm penalty => ‘0’
- l2: coefficient for the L2-norm penalty => ‘1’
Make sure you transpose the matrix so that the genes are listed as rows and samples as columns. Then store the results as a tsv file (pcbc-stemsig.p219.rda).
## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) #NULL for a one-class task
Training a one-class model
Iteration 1 : f = 0.6931472
Iteration 2 : f = 0.3320004
Iteration 3 : f = 0.3263175
## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")
Leave One Out Cross-Validation
To test how the model’s performance by using leave one out cross-validation. This process has three steps:
- Train model on non-left-out data
- Score the left-out sample against the background
- AUC = P( left-out sample is scored above the background )
# Cross-validation with linear model:
# Perform leave-one-out cross-validation
auc <- c()
for(i in 1:ncol(X.tr)) {
## Train a model on non-left-out data
X1 <- X.tr[,-i]
X1 <- as.matrix(X1)
K <- t(X1) %*% X1 / nrow(X1)
m1 <- gelnet.ker(K, NULL, lambda = 1)
w1 <- X1 %*% m1$v
## Score the left-out sample against the background
X.bk <- X.bk[rownames(X.tr),]
X.bk <- as.matrix(X.bk)
s.bk <- t(w1) %*% X.bk
s.bk <- unmatrix(s.bk)
s1 <- t(w1) %*% X.tr[,i]
s1 <- unmatrix(s1)
## AUC = P( left-out sample is scored above the background )
auc[i] <- sum(s1 > s.bk) / length(s.bk)
cat( "Current AUC: ", auc[i], "\n" )
cat( "Average AUC: ", mean(auc), "\n" )
}
If the validation is successful you will notice that the auc variable will be a numeric vector consisting of only 1’s.
r1:c1
4.730943
[1] 1 1 1 1 1 1
[1] TRUE
Replace NA
The Replace NA function is used to replace any values that are NA (not available) with either the mean or the median value of the probe for a give group.
- check for NA values
- locate the NA values
- calculate the mean or median for the probe where each NA is found
- replace the values
replace.NA <-function(data,type.info,by = "mean"){
if(!"group" %in% colnames(type.info)) stop("type.info must have group column")
if(!"sample" %in% colnames(type.info)) stop("type.info must have a sample column")
# Do we have NAs?
if(is.na(table(is.na(data))["TRUE"])){
message("No NAs were found")
return(data)
}
# get NAs index
idx <- which(is.na(data) == TRUE,arr.ind=TRUE)
count <- table(rownames(idx))
message("======= Status Number of NA in probes ========")
message("--------------------- Summary------------------")
print(summary(as.numeric(count)))
message("\n----------- Probes with more nb of NAs -----------")
print(head(sort(count,decreasing = T)))
message("===============================================")
idx <- cbind(idx, mean = NA, median = NA)
# For each NA value calculate the mean for the same probe for the samples
# where it belongs
for(line in 1:nrow(idx)){
row <- idx[line,1]
col <- idx[line,2]
probe <- rownames(idx)[line]
sample <- colnames(data)[col]
group <- type.info[type.info$sample == sample,"group"]
samples.in.group <- type.info[type.info$group == group,]$sample
# get the probe value for all samples in the same group
aux <- data[rownames(data) %in% probe, colnames(data) %in% samples.in.group]
idx[line,3] <- mean(as.numeric(aux),na.rm = TRUE)
idx[line,4] <- median(as.numeric(aux),na.rm = TRUE)
}
# Step 2 replace
for(line in 1:nrow(idx)){
row <- idx[line,1]
col <- idx[line,2]
if(by == "mean"){
data[idx[line,1],idx[line,2]] <- idx[line,3]
} else if(by == "median") {
data[idx[line,1],idx[line,2]] <- idx[line,4]
}
}
return(data)
}
Score PanCan33 Data
Use the signature that was created and stored as ‘pcbc-stemsig.p219.rda’ to now score PanCan33 data. First load the data data.pan Replace empty values with the median probe values.
## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda")
data.pan
load("type.info.Rda") #(contains tumor type info)
type.info
testset <- replace.NA(data.pan, type.info, by="median")
======= Status Number of NA in probes ========
--------------------- Summary------------------
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 1.00 1.00 9.36 3.00 222.00
----------- Probes with more nb of NAs -----------
cg04876127 cg10665848 cg20290850 cg01095506 cg05056545 cg15701612
222 101 35 15 11 10
===============================================
Load the signature w which should be a numeric row vector the size of the probes of interest.
load("pcbc-stemsig.p219.Rda")
w <- mm$w
w[1:10]
cg02927655 cg15948871 cg17676824 cg25552705 cg21434327 cg06678662 cg20300315 cg12257394 cg10381520 cg04193909
-0.03251136 -0.03270579 -0.03151675 -0.04281638 -0.02921891 -0.03178537 -0.02974974 -0.05360807 -0.04052402 -0.04616716
X <- testset[as.character(names(w)),]
head(X)
Convert X into a matrix
X <- as.matrix(X)
X[1:3,1:3]
TCGA-05-4384-01A-01D-1756-05 TCGA-05-4390-01A-02D-1756-05 TCGA-05-4396-01A-21D-1856-05
cg02927655 0.0978782 0.094127 0.0970064
cg15948871 0.7006290 0.675834 0.7751590
cg17676824 0.8089120 0.815386 0.8020920
Score via linear model. The resulting variable ss will be row vector 9627 in length
ss <- t(w) %*% X
ss[1,1:5]
TCGA-05-4384-01A-01D-1756-05 TCGA-05-4390-01A-02D-1756-05 TCGA-05-4396-01A-21D-1856-05 TCGA-05-4405-01A-21D-1856-05
-4.813315 -5.012777 -4.969649 -5.112913
TCGA-05-4410-01A-21D-1856-05
-4.933056
Scale the svores into a ratio from 0 to 1. and store as data frame.
## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))
colnames(ss) <- "mDNAsi"
head(ss)
Save scores to a Rda file.
save(ss, file = "TCGA_mDNAsi.Rda")
Continuous Code
# One class model - 450K data
# Files located @ https://drive.google.com/drive/folders/0BybEcxkBv6VqcmJ0QlA3bVJhc0E?usp=sharing
# Load required libraries
library(gelnet)
library(dplyr)
library(gdata)
# load PCBC metadata (contains group info)
load("pcbc.pd.f.Rda")
# load PCBC 450K data (subset of 219 probes of interest)
load("pcbc.data.Rda")
## Mean-center the data
m <- apply(pcbc.data, 1, mean )
pcbc.data.2 <- pcbc.data - m
# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC
# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples
## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) # NULL for a one-class task
## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")
# Cross-validation with linear model:
## Perform leave-one-out cross-validation
auc <- c()
for( i in 1:ncol(X.tr) )
{
## Train a model on non-left-out data
X1 <- X.tr[,-i]
X1 <- as.matrix(X1)
K <- t(X1) %*% X1 / nrow(X1)
m1 <- gelnet.ker(K, NULL, lambda = 1)
w1 <- X1 %*% m1$v
## Score the left-out sample against the background
X.bk <- X.bk[rownames(X.tr),]
X.bk <- as.matrix(X.bk)
s.bk <- t(w1) %*% X.bk
s.bk <- unmatrix(s.bk)
s1 <- t(w1) %*% X.tr[,i]
s1 <- unmatrix(s1)
## AUC = P( left-out sample is scored above the background )
auc[i] <- sum(s1 > s.bk) / length(s.bk)
cat( "Current AUC: ", auc[i], "\n" )
cat( "Average AUC: ", mean(auc), "\n" )
}
## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda")
# Function to replace NA values with median of probe values by tumor type
source("replaceNA.R")
load("type.info.Rda") #(contains tumor type info)
testset <- replace.NA(data.pan, type.info, by = "median")
## Load the signature
load("pcbc-stemsig.p219.Rda")
w <- mm$w
X <- testset[as.character(names(w)),]
X <- as.matrix(X)
## Score via linear model
ss <- t(w) %*% X
## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))
colnames(ss) <- "mDNAsi"
save(ss, file = "TCGA_mDNAsi.Rda")
References
Sokolov, Artem, Daniel E Carlin, Evan O Paull, Robert Baertsch, and Joshua M Stuart. 2016. “Pathway-Based Genomics Prediction Using Generalized Elastic Net.” PLoS Comput Biol 12 (3). Public Library of Science: e1004790.
---
title: "One class model - 450K data"
author: "Tathiane Maistro Malta"
output:
  html_notebook:
    highlight: tango
    theme: journal
    toc: yes
    toc_float:
      collapsed: no
  html_document:
    toc: yes
editor_options:
  chunk_output_type: inline
bibliography: bibliography.bib
---
***
# Purpose 

***

# Introduction 

# Setup 

Begin by downloading and opening the following packages into your library.
The [gelnet](https://cran.r-project.org/web/packages/gelnet/index.html) library will be used to train the model, 
[dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) and [gdata](https://cran.r-project.org/web/packages/gdata/index.html) are used for data manipulation.
```{r}
library(gelnet)
library(dplyr)
library(gdata)
```

# Data 

The the pcbc data is organized in a data frame with 99 samples as columns and 219 methylation probes as rows.
```{r}
load("pcbc.data.Rda")
pcbc.data
```

```{r}
load("pcbc.pd.f.Rda")
head(pcbc.pd.f)
```


Then find the mean center by subtracting the mean of each probe from the entire pcbc data. 
The mean of each probe just be in a numeric vector the same size as the number of probes, in this case 219.
```{r}
m <- apply(pcbc.data, 1, mean)
m[1:5]
```
```{r}
pcbc.data.2 <- pcbc.data - m
pcbc.data.2
```


Identify stem cells and break up all samples into 2 groups:

* Stem cell ('X.tr' object)
* not stem cell ('X.bk' object)

```{r}
# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC

# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples
```

Now we can begin to train the the one-class model with the [gelnet](https://www.rdocumentation.org/packages/gelnet/versions/1.2.1/topics/gelnet) function. 
The gelnet function can be used for Linear Regression, 
Binary Classification and One class Problems by using an iterative method called 
coordinated descent [@sokolov2016pathway].

```{r, eval=FALSE}
gelnet(X, y, l1, l2)
```

It has four main arguments described below:

* **X**: n by p matrix => transpose('X.r')
* **y**: 'NULL' for one class models 
* **l1**: coefficient for the L1-norm penalty => '0' 
* **l2**: coefficient for the L2-norm penalty => '1'

Make sure you transpose the matrix so that the genes are listed as rows and samples as columns.
Then store the results as a tsv file (pcbc-stemsig.p219.rda). 
```{r}
## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) #NULL for a one-class task 

## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")
```



# Leave One Out Cross-Validation 

To test how the model's performance by using leave one out cross-validation. 
This process has three steps: 

1. Train model on non-left-out data
2. Score the left-out sample against the background
3. AUC = P( left-out sample is scored above the background )

```{r, results="hide"}
# Cross-validation with linear model:
# Perform leave-one-out cross-validation
auc <- c()
for(i in 1:ncol(X.tr)) {
  ## Train a model on non-left-out data
  X1 <- X.tr[,-i]
  X1 <- as.matrix(X1)
  K <- t(X1) %*% X1 / nrow(X1)
  m1 <- gelnet.ker(K, NULL, lambda = 1)
  w1 <- X1 %*% m1$v
  
  ## Score the left-out sample against the background
  X.bk <- X.bk[rownames(X.tr),]
  X.bk <- as.matrix(X.bk)
  s.bk <- t(w1) %*% X.bk
  s.bk <- unmatrix(s.bk)
  
  s1 <- t(w1) %*% X.tr[,i]
  s1 <- unmatrix(s1)
  
  ## AUC = P( left-out sample is scored above the background )
  auc[i] <- sum(s1 > s.bk) / length(s.bk)
  cat( "Current AUC: ", auc[i], "\n" )
  cat( "Average AUC: ", mean(auc), "\n" )
}
```


If the validation is successful you will notice that the auc variable will be a numeric vector consisting of only 1's.  
```{r}
print(s1)
```
```{r}
head(auc)
all(auc == 1)
```

# Replace NA 

The Replace NA function is used to replace any values that are NA (not available) 
with  either the mean or the median value of the probe for a give group. 

1. check for NA values
2. locate the NA values 
3. calculate the mean or median for the probe where each NA is found 
4. replace the values

```{r, message = FALSE, error = FALSE}
replace.NA <-function(data,type.info,by = "mean"){
  if(!"group" %in% colnames(type.info)) stop("type.info must have group column")
  if(!"sample" %in% colnames(type.info)) stop("type.info must have a sample column")
  
  # Do we have NAs?
  if(is.na(table(is.na(data))["TRUE"])){
    message("No NAs were found")
    return(data)
  }
  # get NAs index 
  idx <- which(is.na(data) == TRUE,arr.ind=TRUE)
  count <- table(rownames(idx))
  message("======= Status Number of NA in probes ========")
  message("--------------------- Summary------------------")
  print(summary(as.numeric(count)))
  message("\n----------- Probes with more nb of NAs -----------")
  print(head(sort(count,decreasing = T)))
  message("===============================================")
                 
  idx <- cbind(idx, mean = NA, median = NA)
  
  # For each NA value calculate the mean for the same probe for the samples
  # where it belongs
  for(line in 1:nrow(idx)){
    row <- idx[line,1]
    col <- idx[line,2]
    probe <- rownames(idx)[line]
    sample <- colnames(data)[col]
    group <- type.info[type.info$sample == sample,"group"]
    samples.in.group <- type.info[type.info$group == group,]$sample
    
    # get the probe value for all samples in the same group 
    aux <- data[rownames(data) %in% probe, colnames(data) %in% samples.in.group] 
    
    idx[line,3] <- mean(as.numeric(aux),na.rm = TRUE)
    idx[line,4] <- median(as.numeric(aux),na.rm = TRUE)
  }
  # Step 2 replace
  for(line in 1:nrow(idx)){
    row <- idx[line,1]
    col <- idx[line,2]
    if(by == "mean"){
      data[idx[line,1],idx[line,2]] <- idx[line,3]  
    } else if(by == "median") { 
      data[idx[line,1],idx[line,2]] <- idx[line,4]
    }
  }
  return(data)
}
```

# Score PanCan33 Data 

Use the signature that was created and stored as 'pcbc-stemsig.p219.rda' to now score PanCan33 data. 
First load the data data.pan
Replace empty values with the median probe values.

```{r}
## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda") 
data.pan
```
```{r}
load("type.info.Rda") #(contains tumor type info)
type.info
```
```{r}
testset <- replace.NA(data.pan, type.info, by="median") 
head(testset)
```


Load the signature w which should be a numeric row vector the size of the probes of interest. 
```{r}
load("pcbc-stemsig.p219.Rda")
w <- mm$w
w[1:10]
```


```{r}
X <- testset[as.character(names(w)),]
head(X)
```

Convert X into a matrix
```{r}
X <- as.matrix(X)
X[1:3,1:3]
```


Score via linear model. The resulting variable ss will be row vector 9627 in length 
```{r}
ss <- t(w) %*% X
ss[1,1:5]
```

Scale the svores into a ratio from 0 to 1. and store as data frame. 
```{r}
## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))
colnames(ss) <- "mDNAsi"   
head(ss)
```

Save scores to a Rda file.
```{r}
save(ss, file = "TCGA_mDNAsi.Rda")
```

# Continuous Code 

```{r,eval = FALSE}
# One class model - 450K data
# Files located @ https://drive.google.com/drive/folders/0BybEcxkBv6VqcmJ0QlA3bVJhc0E?usp=sharing

# Load required libraries
library(gelnet)
library(dplyr)
library(gdata)

# load PCBC metadata (contains group info)
load("pcbc.pd.f.Rda")
# load PCBC 450K data (subset of 219 probes of interest)
load("pcbc.data.Rda")

## Mean-center the data
m <- apply(pcbc.data, 1, mean )
pcbc.data.2 <- pcbc.data - m

# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC

# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples

## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) # NULL for a one-class task 

## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")

# Cross-validation with linear model:
## Perform leave-one-out cross-validation
auc <- c()
for( i in 1:ncol(X.tr) )
{
  ## Train a model on non-left-out data
  X1 <- X.tr[,-i]
  X1 <- as.matrix(X1)
  K <- t(X1) %*% X1 / nrow(X1)
  m1 <- gelnet.ker(K, NULL, lambda = 1)
  w1 <- X1 %*% m1$v
  
  ## Score the left-out sample against the background
  X.bk <- X.bk[rownames(X.tr),]
  X.bk <- as.matrix(X.bk)
  s.bk <- t(w1) %*% X.bk
  s.bk <- unmatrix(s.bk)
  
  s1 <- t(w1) %*% X.tr[,i]
  s1 <- unmatrix(s1)
  
  ## AUC = P( left-out sample is scored above the background )
  auc[i] <- sum(s1 > s.bk) / length(s.bk)
  cat( "Current AUC: ", auc[i], "\n" )
  cat( "Average AUC: ", mean(auc), "\n" )
}

## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda") 
# Function to replace NA values with median of probe values by tumor type
source("replaceNA.R")
load("type.info.Rda") #(contains tumor type info)
testset <- replace.NA(data.pan, type.info, by = "median") 

## Load the signature
load("pcbc-stemsig.p219.Rda")
w <- mm$w

X <- testset[as.character(names(w)),]
X <- as.matrix(X)

## Score via linear model
ss <- t(w) %*% X
## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))

colnames(ss) <- "mDNAsi"   
save(ss, file = "TCGA_mDNAsi.Rda")
```

# References






